Sumit (8977542)

Paras Rupani (8961758)

Asif Afzal (8764552)

Importing Modules¶

In [145]:
import numpy as np 
import pandas as pd 
import plotly
plotly.offline.init_notebook_mode()
In [146]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
In [147]:
data = pd.read_csv(r'./laptop.csv')
data.sample(3)
Out[147]:
Unnamed: 0 Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price
1272 1272 Lenovo Notebook 14.0 1366x768 Intel Celeron Dual Core N3050 1.6GHz 2GB 64GB Flash Storage Intel HD Graphics Windows 10 1.5kg 12201.12
86 86 HP Ultrabook 15.6 IPS Panel Full HD 1920x1080 Intel Core i7 8550U 1.8GHz 8GB 256GB SSD Nvidia GeForce GTX 940MX Windows 10 1.83kg 37242.72
140 140 Acer Notebook 17.3 IPS Panel Full HD 1920x1080 Intel Core i3 7130U 2.7GHz 4GB 1TB HDD Nvidia GeForce MX130 Windows 10 3kg 34898.40
In [148]:
data.columns
Out[148]:
Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

Data Analysis¶

Which brand is the most frequent in the dataframe?¶

In [149]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
company_counts = data['Company'].value_counts().reset_index()
company_counts.columns = ['Company', 'Count']

fig = px.bar(company_counts, x='Company', y='Count',
             labels={'Company': 'Company', 'Count': 'Number of Laptops'},
             color='Company',
             title='Number of laptops by brands')
fig.update_layout(xaxis={'tickangle': 80})
fig.show()

Most of laptops are from Dell, Lenovo and HP.

What type of laptop is the most frequent?¶

In [150]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
type_counts = data['TypeName'].value_counts().reset_index()
type_counts.columns = ['TypeName', 'Count']

fig = px.pie(type_counts, values='Count', names='TypeName', title='Laptop types')
fig.show()
In [151]:
print('Most laptops are notebooks, which make {:.2f}% of the total laptops'.format(len(data[data['TypeName']=='Notebook'])*100/len(data)))
Most laptops are notebooks, which make 55.79% of the total laptops

Which size is the most popular?¶

In [152]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
inch_counts = data['Inches'].value_counts().reset_index()
inch_counts.columns = ['Inches', 'Count']

# Plot the screen size distribution
fig = px.bar(inch_counts, x='Inches', y='Count', 
             labels={'Inches': 'Screen Size (inches)', 'Count': 'Number of Laptops'},
             title='Laptop Screen Size Distribution')

# Set a narrower range for the x-axis
fig.update_layout(xaxis=dict(title='Screen Size (inches)', tickangle=45, range=[min(inch_counts['Inches']) - 0.5, max(inch_counts['Inches']) + 0.5]),
                  yaxis=dict(title='Number of Laptops'))

fig.show()
In [153]:
print('Most laptops have 15.6 inches, which make {:.2f}% of the total laptops'.format(len(data[data['Inches']==15.6])*100/len(data)))
Most laptops have 15.6 inches, which make 51.04% of the total laptops

How is weight distributed among the laptops?¶

In [187]:
import plotly.express as px

def num_plot(data, column, title, unit):
    # Sort the DataFrame by the specified column
    sorted_data = data.sort_values(by=column)

    fig = px.histogram(sorted_data, x=column, 
                       labels={column: f'{column} ({unit})', 'count': 'Number of Laptops'},
                       title=title)
    fig.update_layout(xaxis={'title': f'{column} ({unit})'},
                      yaxis={'title': 'Number of Laptops'})
    fig.show()

# Assuming your DataFrame containing the data is named 'data'
num_plot(data, 'Weight', 'Weight Distribution', 'kg')
In [155]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

values = data['Memory'].value_counts()
filtered_ram = values[values > 60].index
filtered_df = data[data['Memory'].isin(filtered_ram)]

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=filtered_df["Memory"], data=filtered_df, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_xlabel("Storage Options")
plt.title('Hard Drive Storage Distribution')
plt.show()
No description has been provided for this image

What is the average price of laptops by company?¶

In [156]:
import plotly.express as px

# Assuming data is your DataFrame containing the data
company_list = data['Company'].unique()

filtered_data = data[data['Company'].isin(company_list)]
average_price_by_company = filtered_data.groupby('Company')['Price'].mean().reset_index()

fig = px.bar(average_price_by_company, x='Company', y='Price', 
             title='Average price of laptops by company',
             labels={'Company': 'Company', 'Price': 'Average Price'},
             color='Company',
             color_discrete_sequence=px.colors.qualitative.Pastel,
             text='Price')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(xaxis={'tickangle': 80})
fig.show()

Data preprocessing¶

1. Removing column 'Unnamed: 0¶

In [157]:
data.drop(columns='Unnamed: 0', inplace=True)
data.sample(3)
Out[157]:
Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price
794 Apple Ultrabook 12.0 IPS Panel Retina Display 2304x1440 Intel Core M 1.2GHz 8GB 512GB Flash Storage Intel HD Graphics 5300 Mac OS X 0.920kg 62071.20
458 Microsoft Ultrabook 13.5 Touchscreen 2256x1504 Intel Core i7 7660U 2.5GHz 8GB 256GB SSD Intel Iris Plus Graphics 640 Windows 10 S 1.25kg 95850.72
47 Asus Gaming 17.3 Full HD 1920x1080 AMD Ryzen 1700 3GHz 8GB 256GB SSD + 1TB HDD AMD Radeon RX 580 Windows 10 3.2kg 69210.72

Checking for 'null values' & 'duplicate rows' in dataset¶

In [158]:
def count_percent(data):
    data_columns = pd.DataFrame({'Count Missing': data.isnull().sum(),
                                'Percent Missing': data.isnull().sum()*100/data.shape[0],
                                'Count Duplicate Rows': data.duplicated().sum(),
                                'Percent Duplicate Rows': data.duplicated().sum()*100/data.shape[0]
                                })
    return data_columns
count_percent(data) 
Out[158]:
Count Missing Percent Missing Count Duplicate Rows Percent Duplicate Rows
Company 0 0.0 29 2.225633
TypeName 0 0.0 29 2.225633
Inches 0 0.0 29 2.225633
ScreenResolution 0 0.0 29 2.225633
Cpu 0 0.0 29 2.225633
Ram 0 0.0 29 2.225633
Memory 0 0.0 29 2.225633
Gpu 0 0.0 29 2.225633
OpSys 0 0.0 29 2.225633
Weight 0 0.0 29 2.225633
Price 0 0.0 29 2.225633

There are 29 duplicate rows.

In [159]:
# count_non_duplicate_rows = data.duplicated(keep=False).sum() - data.duplicated().sum()
# print("Out of {0} rows, {1} rows are original and {2} are duplicate rows.".format(data.duplicated(keep=False).sum(),count_non_duplicate_rows,data.duplicated().sum()))

Dropping duplicate rows¶

In [160]:
data.drop_duplicates(inplace=True)
print("Duplicate Rows: ",data.duplicated().sum())
print(data.shape)
count_percent(data)
Duplicate Rows:  0
(1274, 11)
Out[160]:
Count Missing Percent Missing Count Duplicate Rows Percent Duplicate Rows
Company 0 0.0 0 0.0
TypeName 0 0.0 0 0.0
Inches 0 0.0 0 0.0
ScreenResolution 0 0.0 0 0.0
Cpu 0 0.0 0 0.0
Ram 0 0.0 0 0.0
Memory 0 0.0 0 0.0
Gpu 0 0.0 0 0.0
OpSys 0 0.0 0 0.0
Weight 0 0.0 0 0.0
Price 0 0.0 0 0.0
  • 'Unnamed: 0' column is removed.

  • We found 29 duplicated and rows and removed it.

  • Now there are no more 'duplicate rows' and 'null values' in the dataset.

using wishker approach to decide Budget , mid-range , expensive Laptop prices¶

In [161]:
min_price = data['Price'].min()
max_price = data['Price'].max()
median_price = data['Price'].median()

print("Minimum Price:", min_price)
print("Maximum Price:", max_price)
print("Median Price:", median_price)
Minimum Price: 9270.72
Maximum Price: 324954.72
Median Price: 52693.92

Selecting Budget Laptops, Mid-range, expensive Price Range¶

In [162]:
labels = ['budget', 'Mid-range', 'expensive']

# Split the 'Price' column into categories using quartiles and assign labels
data['Price_Category'] = pd.qcut(data['Price'], q=[0, 0.33, 0.66, 1], labels=labels)

# Display the first few rows of the DataFrame with the new 'Price_Category' column
print(data[['Price', 'Price_Category']].head())
         Price Price_Category
0   71378.6832      expensive
1   47895.5232      Mid-range
2   30636.0000         budget
3  135195.3360      expensive
4   96095.8080      expensive

Converting String Value Present in Ram, Memory, Weight into Numerical value¶

In [163]:
data['Ram'] = data['Ram'].str.extract('(\d+)').astype(float)
data['Memory'] = data['Memory'].str.extract('(\d+)').astype(float)
data['Weight'] = data['Weight'].str.extract('(\d+.\d+)').astype(float)
In [164]:
data.head()
Out[164]:
Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price Price_Category
0 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 2.3GHz 8.0 128.0 Intel Iris Plus Graphics 640 macOS 1.37 71378.6832 expensive
1 Apple Ultrabook 13.3 1440x900 Intel Core i5 1.8GHz 8.0 128.0 Intel HD Graphics 6000 macOS 1.34 47895.5232 Mid-range
2 HP Notebook 15.6 Full HD 1920x1080 Intel Core i5 7200U 2.5GHz 8.0 256.0 Intel HD Graphics 620 No OS 1.86 30636.0000 budget
3 Apple Ultrabook 15.4 IPS Panel Retina Display 2880x1800 Intel Core i7 2.7GHz 16.0 512.0 AMD Radeon Pro 455 macOS 1.83 135195.3360 expensive
4 Apple Ultrabook 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 3.1GHz 8.0 256.0 Intel Iris Plus Graphics 650 macOS 1.37 96095.8080 expensive

Checking null value in weight¶

In [165]:
null_weight = data['Weight'].isnull().sum()

# Display the number of null values
print("Number of null values in the 'Weight' column:", null_weight)
Number of null values in the 'Weight' column: 50
In [166]:
def analyze_outliers(rel_column):
    # Visualize the box plot
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=rel_column, data=data)
    plt.title(f'Boxplot of {rel_column}')
    plt.show()

    # Identify outliers using IQR method
    Q1 = data[rel_column].quantile(0.25)
    Q3 = data[rel_column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count outliers
    outliers = data[(data[rel_column] < lower_bound) | (data[rel_column] > upper_bound)]
    num_outliers = outliers.shape[0]

    # Calculate summary statistics with and without outliers
    mean_with_outliers = data[rel_column].mean()
    std_dev_with_outliers = data[rel_column].std()

    mean_without_outliers = data[~data[rel_column].isin(outliers[rel_column])][rel_column].mean()
    std_dev_without_outliers = data[~data[rel_column].isin(outliers[rel_column])][rel_column].std()

    # Print results
    print("Number of outliers:", num_outliers)
    print("Mean with outliers:", mean_with_outliers)
    print("Standard deviation with outliers:", std_dev_with_outliers)
    print("Mean without outliers:", mean_without_outliers)
    print("Standard deviation without outliers:", std_dev_without_outliers)

analyze_outliers("Weight")
No description has been provided for this image
Number of outliers: 45
Mean with outliers: 2.0289787581699343
Standard deviation with outliers: 0.6727695661696617
Mean without outliers: 1.9458354537743854
Standard deviation without outliers: 0.5270787369766141

replacing null value in weight with mean¶

In [167]:
mean_weight = data['Weight'].mean()

data['Weight'].fillna(mean_weight, inplace=True)

printing unique value in TypeName¶

In [168]:
unique_type_names = data['TypeName'].unique()

# Display the unique values
print("Unique values in the 'TypeName' column:")
for value in unique_type_names:
    print(value)
Unique values in the 'TypeName' column:
Ultrabook
Notebook
Netbook
Gaming
2 in 1 Convertible
Workstation

categorical encoding¶

In [169]:
type_name_mapping = {
    'Ultrabook': 0,
    'Notebook': 1,
    'Netbook': 2,
    'Gaming': 3,
    '2 in 1 Convertible': 4,
    'Workstation': 5
}

data['TypeName'] = data['TypeName'].map(type_name_mapping)

Checking Dataset Before deciding Features and Labels¶

In [170]:
data.head()
Out[170]:
Company TypeName Inches ScreenResolution Cpu Ram Memory Gpu OpSys Weight Price Price_Category
0 Apple 0 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 2.3GHz 8.0 128.0 Intel Iris Plus Graphics 640 macOS 1.37 71378.6832 expensive
1 Apple 0 13.3 1440x900 Intel Core i5 1.8GHz 8.0 128.0 Intel HD Graphics 6000 macOS 1.34 47895.5232 Mid-range
2 HP 1 15.6 Full HD 1920x1080 Intel Core i5 7200U 2.5GHz 8.0 256.0 Intel HD Graphics 620 No OS 1.86 30636.0000 budget
3 Apple 0 15.4 IPS Panel Retina Display 2880x1800 Intel Core i7 2.7GHz 16.0 512.0 AMD Radeon Pro 455 macOS 1.83 135195.3360 expensive
4 Apple 0 13.3 IPS Panel Retina Display 2560x1600 Intel Core i5 3.1GHz 8.0 256.0 Intel Iris Plus Graphics 650 macOS 1.37 96095.8080 expensive

Correlation Matrix¶

  • The correlation matrix can give some useful informations about the linear correlation between the numerical features
In [171]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select the columns for correlation calculation
selected_columns = ['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']
selected_data = data[selected_columns]

plt.figure(figsize=(6, 5))
sns.heatmap(selected_data.corr(), cmap='RdBu', annot=True, vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image

RAM has a high positive correlation with Weight (+0.75):

  • Laptops having higher screen size tend to have an increase in weight.

Inches and Weight have a high positive correlation (+0.82):

  • As the screen size of the laptop increases, so does the price.
In [172]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

numeric_columns = data.select_dtypes(include=['number']).drop(columns=['Price','TypeName'])

# Calculate correlation of numeric columns with respect to 'Price' column
correlation_with_price = numeric_columns.corrwith(data['Price'])

# Plot the correlation values
plt.figure(figsize=(8, 6))
sns.barplot(x=correlation_with_price.index, y=correlation_with_price.values, palette="muted")
plt.title('Correlation with Price')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image

We can observe that the available RAM significantly contributes to the Price of the laptop.

  • Higher RAM would lead to an expensive laptop.
  • Lower RAM indicates an affordable laptop.

Define features and target¶

In [173]:
X = data[['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']]
y = data['Company']

Split the dataset into 80% training and 20% testing¶

In [174]:
from sklearn.model_selection import train_test_split

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
Training set shape: (1019, 5) (1019,)
Testing set shape: (255, 5) (255,)

Model-1 (RandomForestClassifier)¶

In [175]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Print the training accuracy
training_accuracy = rf_classifier.score(X_train, y_train)
print("Training accuracy:", training_accuracy)
Training accuracy: 0.9018645731108931

Predictions of Company¶

In [176]:
# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Print the first few predictions
print("Predictions:", y_pred[:5])

# Print the actual values
print("Actual values:", y_test.values[:5])
Predictions: ['HP' 'Lenovo' 'Lenovo' 'Dell' 'HP']
Actual values: ['HP' 'Dell' 'Xiaomi' 'Dell' 'Dell']
In [177]:
# Print the feature names
print("Feature names:", X.columns.tolist())

# Print the unique values of the target variable
print("Unique company labels:", y.unique())
Feature names: ['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']
Unique company labels: ['Apple' 'HP' 'Acer' 'Asus' 'Dell' 'Lenovo' 'Chuwi' 'MSI' 'Microsoft'
 'Toshiba' 'Huawei' 'Xiaomi' 'Vero' 'Razer' 'Mediacom' 'Samsung' 'Google'
 'Fujitsu' 'LG']

Providing inputs for predictions¶

In [178]:
# Define the specifications for the laptop you want to predict
spec = [16, 512, 15.6, 3, 1]

spec = [spec]
company_pred = rf_classifier.predict(spec)
print("Predicted Company:", company_pred[0])
Predicted Company: Lenovo

Model-2 (LinearRegression)¶

Define features and target variable¶

In [179]:
X = data[['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']]
y = data['Price']
In [180]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import pickle

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Evaluate models using cross-validation
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
    print(f"{name}: MAE = {-scores.mean()} (±{scores.std()})")
Linear Regression: MAE = 17148.89697478473 (±777.1787229299255)
Decision Tree: MAE = 14358.04686958316 (±1395.4092472105663)
Random Forest: MAE = 12290.942519498396 (±933.7219958842466)
Gradient Boosting: MAE = 12941.940647067655 (±445.63436217606977)

Saving the Best Model¶

In [181]:
# Train the best model on the entire training set
best_model = RandomForestRegressor(random_state=42)
best_model.fit(X_train, y_train)

# Save the best model to a file
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

Predictions vs actual values vs company names¶

In [182]:
# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Select the first few predictions, actual values, and company names from the original dataset 'data'
predictions_df = pd.DataFrame({
    'Actual Price': y_test.values[:5],
    'Predicted Price': y_pred[:5],
    'Company': data.loc[X_test.index[:5], 'Company']  # Assuming 'Company' is the column name for company names in 'data'
})

# Print the DataFrame
print(predictions_df)
     Actual Price  Predicted Price Company
309    24988.3200     20353.739791      HP
622    41345.2800     24528.623802    Dell
184    63882.7200     63483.290052  Xiaomi
705    42357.6000     43016.424577    Dell
522    68184.0144     59898.614709    Dell

Random 5 Prediction for Prices¶

In [183]:
# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Print the first few predictions
print("Predictions:", y_pred[:5])

# Print the actual values
print("Actual values:", y_test.values[:5])
Predictions: ['HP' 'Lenovo' 'Lenovo' 'Dell' 'HP']
Actual values: [24988.32   41345.28   63882.72   42357.6    68184.0144]

selecting the specifications for the laptop you want to predict¶

In [184]:
# Define the specifications for the laptop you want to predict
spec = [[8, 128, 13.3, 1.37, 0]]

# Make predictions using the trained model
price_pred = best_model.predict(spec)

# Print the predicted price
print("Predicted Price:", round(price_pred[0], 2))
Predicted Price: 66922.73

Define the specifications for the laptop you want to predict based on budget, Mid-range, expensive¶

In [185]:
# Define the specifications for the laptop you want to predict
spec = [[16, 256, 13.3, 1.37, 0]]


price_pred = best_model.predict(spec)
print("Predicted Price:", round(price_pred[0], 2))


if price_pred[0] <= data['Price'].quantile(0.33):
    category = 'budget'
elif price_pred[0] >= data['Price'].quantile(0.66):
    category = 'expensive'
else:
    category = 'Mid-range'

# Print the price category
print("Price Category:", category)
Predicted Price: 96809.43
Price Category: expensive

Predictions with User Choice¶

In [186]:
import pandas as pd

# Define the feature names
feature_names = ['Ram', 'Memory', 'Inches', 'Weight', 'TypeName']

# Initialize an empty dictionary to store user-provided values
spec = {}

# Ask the user to provide values for each feature
for feature in feature_names:
    value = input("Enter value for {}: ".format(feature))
    spec[feature] = value

# Convert the user-provided values to the desired format
ram = int(spec['Ram'])
memory = int(spec['Memory'])
inches = float(spec['Inches'])
weight = float(spec['Weight'])
type_name = spec['TypeName']

# Construct the spec list using the user-provided values
spec = [[ram, memory, inches, weight, type_name]]

# Make predictions using the trained model
price_pred = best_model.predict(spec)

# Print the predicted price
print("Predicted Price:", round(price_pred[0], 2))

# Determine the price category
if price_pred[0] <= data['Price'].quantile(0.33):
    category = 'budget'
elif price_pred[0] >= data['Price'].quantile(0.66):
    category = 'expensive'
else:
    category = 'Mid-range'

# Print the price category
print("Price Category:", category)
Predicted Price: 66727.73
Price Category: Mid-range